Java 程式碼 << Previous Next >> C或C++程式碼
Python 程式碼

def parse_content():
    """use bs4 and re module functions to parse content.htm"""
    #from pybean import Store, SQLiteWriter
    # if no content.db, create database file with cms table
    '''
    if not os.path.isfile(config_dir+"content.db"):
        library = Store(SQLiteWriter(config_dir+"content.db", frozen=False))
        cms = library.new("cms")
        cms.follow = 0
        cms.title = "head 1"
        cms.content = "content 1"
        cms.memo = "first memo"
        library.save(cms)
        library.commit()
    '''
    # if no content.htm, generate a head 1 and content 1 file
    if not os.path.isfile(config_dir+"content.htm"):
        # create content.htm if there is no content.htm
        File = open(config_dir + "content.htm", "w", encoding="utf-8")
        File.write("<h1>head 1</h1>content 1")
        File.close()
    subject = file_get_contents(config_dir+"content.htm")
    # deal with content without content
    if subject == "":
        # create content.htm if there is no content.htm
        File = open(config_dir + "content.htm", "w", encoding="utf-8")
        File.write("<h1>head 1</h1>content 1")
        File.close()
        subject = "<h1>head 1</h1>content 1"
    # initialize the return lists
    head_list = []
    level_list = []
    page_list = []
    # make the soup out of the html content
    soup = bs4.BeautifulSoup(subject, 'html.parser')
    # 嘗試解讀各種情況下的標題
    soup = _remove_h123_attrs(soup)
    # 改寫 content.htm 後重新取 subject
    with open(config_dir + "content.htm", "wb") as f:
        f.write(soup.encode("utf-8"))
    subject = file_get_contents(config_dir+"content.htm")
    # get all h1, h2, h3 tags into list
    htag= soup.find_all(['h1', 'h2', 'h3'])
    n = len(htag)
    # get the page content to split subject using each h tag
    temp_data = subject.split(str(htag[0]))
    if len(temp_data) > 2:
        subject = str(htag[0]).join(temp_data[1:])
    else:
        subject = temp_data[1]
    if n >1:
            # i from 1 to i-1
            for i in range(1, len(htag)):
                head_list.append(htag[i-1].text.strip())
                # use name attribute of h* tag to get h1, h2 or h3
                # the number of h1, h2 or h3 is the level of page menu
                level_list.append(htag[i-1].name[1])
                temp_data = subject.split(str(htag[i]))
                if len(temp_data) > 2:
                    subject = str(htag[i]).join(temp_data[1:])
                else:
                    subject = temp_data[1]
                # cut the other page content out of htag from 1 to i-1
                cut = temp_data[0]
                # add the page content
                page_list.append(cut)
    # last i
    # add the last page title
    head_list.append(htag[n-1].text.strip())
    # add the last level
    level_list.append(htag[n-1].name[1])
    temp_data = subject.split(str(htag[n-1]))
    # the last subject
    subject = temp_data[0]
    # cut the last page content out
    cut = temp_data[0]
    # the last page content
    page_list.append(cut)
    return head_list, level_list, page_list
Java 程式碼 << Previous Next >> C或C++程式碼